Libraries

Load data

I am using the AHA dataset for writing my code because it is the simplest dataset I can work with. Simplest = significant number of baseline samples without issue of repeated measures, balanced distribution of classes, likely difference to be detected present.

p_ps = readRDS("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/POMMS/R24_POMMS/data/processed/20221104_pomms_metadata_plant.rds")

p_ps
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 252 taxa and 384 samples ]
sample_data() Sample Data:       [ 384 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 252 taxa by 8 taxonomic ranks ]
sample_data(p_ps)
Sample Data:        [384 samples by 71 sample variables]:

Preprocess

Remove problematic samples

Taking out samples with less than or equal to 0 reads and keeping only entry timepoint reduces samples from 384 down to 240.

p_ps = p_ps %>%
  subset_samples( reads > 0) %>%
  subset_samples(timepoint == "Entry")

p_ps
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 252 taxa and 240 samples ]
sample_data() Sample Data:       [ 240 samples by 71 sample variables ]
tax_table()   Taxonomy Table:    [ 252 taxa by 8 taxonomic ranks ]

Class distribution in this dataset: Case = 193 Control = 47

sample_data(p_ps) %>%
  as.data.frame() %>%
  as_tibble() %>%
  group_by(group) %>%
  summarise(count = n())

Data transform

otu_clr = abundances(p_ps, "clr") %>% # rclr transform could not be performed, get Nans
  t()

p_ps = phyloseq(otu_table(otu_clr, taxa_are_rows = FALSE),
                   sample_data(sample_data(p_ps)),
                   tax_table(tax_table(p_ps)))

rm(otu_clr)

Feature table

Extract data

Order of asv colnames in OTU table and tax table is the same.

features = p_ps@otu_table %>%
  as.data.frame() %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column("sample")

feature_labels = c("sample")

 names = tax_table(p_ps) %>%
      data.frame() %>%
      lowest_level() %>%
   pull(name)

 feature_labels = append(feature_labels, names)

 feature_labels = make.unique(feature_labels) %>%
   make.names()

 colnames(features) = feature_labels

features

Attach data labels

sample_labels = p_ps@sam_data%>%
  as.data.frame() %>%
  as_tibble(rownames = NA) %>%
  rownames_to_column("sample") %>%
  select(sample, group)

features = sample_labels %>%
  left_join(features)
Joining, by = "sample"

Zero variance features

No features have zero variance

nzv = features %>%
  select(!c(sample,group)) %>%
  nearZeroVar(saveMetrics = TRUE)
rm(nzv)

Correlated predictors

feature_cor = features %>%
  select(!c(sample,group)) %>%
 cor() 

summary(feature_cor[upper.tri(feature_cor)])
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.43611 -0.03523  0.06750  0.13015  0.22513  1.00000 
high_cor = findCorrelation(feature_cor, cutoff = .75)

filtered_features = features[,-high_cor]
rm(high_cor)
rm(feature_cor)

Linear dependencies

feature_ld = filtered_features %>%
  select(!c(sample,group)) %>%
  findLinearCombos()

filtered_features = filtered_features[, - feature_ld$remove]

Input table

Preprocessing reduces column number to 131 from 235

input = filtered_features %>%
  mutate(group_shuffle = sample(group) %>%
           as.factor(),
         group = as.factor(group))

Modeling

Load functions from classification_function_AA file

results = loop_label(iterations =20, input, label_list, percent = 0.7, downsampling = TRUE)
[1] "will downsample on training data"
Warning: There were missing values in resampled performance measures.
[1] "group"
[[1]]
[1] "group"

[[2]]
          Reference
Prediction Case Healthy
   Case     676     125
   Healthy  464     155

[1] "will downsample on training data"
Warning: There were missing values in resampled performance measures.Warning: There were missing values in resampled performance measures.
[1] "group_shuffle"
[[1]]
[1] "group_shuffle"

[[2]]
          Reference
Prediction Case Healthy
   Case     578     142
   Healthy  562     138
rm(label_list)

Plotting

AUC plot and t-test

auc_df %>%
  pivot_longer(cols = c("group", "group_shuffle"), names_to = "feature", values_to = "AUC") %>%
  ggplot()+
  geom_boxplot(aes(x=feature, y = AUC))+
  theme_classic()


t.test(auc_df$group, auc_df$group_shuffle)

    Welch Two Sample t-test

data:  auc_df$group and auc_df$group_shuffle
t = 3.6882, df = 37.743, p-value = 0.0007082
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.04036328 0.13863672
sample estimates:
mean of x mean of y 
   0.5815    0.4920 

Importance plots

look-up table

#Making asv reference for names in model
lookup_asv = tax_table(p_ps) %>% 
     data.frame() %>%
      lowest_level() %>%
   rownames_to_column("asv") %>% 
   select(asv, name) %>%
   mutate(name = make.names(make.unique(name)))

features

summarize importances

top 10 ranked list

top10
 [1] "Helianthus.annuus" "Brassica"          "Solanaceae"       
 [4] "Apiaceae"          "Capsicum.annuum"   "Poaceae"          
 [7] "NA."               "NA.3"              "Lactuca.sativa"   
[10] "Zea.mays"         

Importance plot

importance_df %>%
  pivot_longer(cols = c("NA.":"Apium.graveolens"), names_to = "food", values_to = "importance")  %>%
  filter(variable == "group" & food %in% top10) %>%
  ggplot()+
  geom_boxplot(aes(x= reorder(food,-importance), y = importance))+
  labs(x= "Food", y ="Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

Some of the important NAs for predictions only map to bacterial species in BLAST.

Important taxa directions

lookup_asv %>%
  filter(name == "NA.") %>%
  pull(asv)
[1] "AAATCCTGTTTTATGAAAATAAACAAGGGTTTCATAAACCGAAAATAAAAAAG"
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyBMaWJyYXJpZXMKCmBgYHtyLCBpbmNsdWRlID0gRkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KG1pY3JvYmlvbWUpCmxpYnJhcnkocGh5bG9zZXEpCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkocmFuZG9tRm9yZXN0KQpsaWJyYXJ5KE1CdXRpbHMpCmBgYAoKIyBMb2FkIGRhdGEKCkkgYW0gdXNpbmcgdGhlIEFIQSBkYXRhc2V0IGZvciB3cml0aW5nIG15IGNvZGUgYmVjYXVzZSBpdCBpcyB0aGUgc2ltcGxlc3QgZGF0YXNldCBJIGNhbiB3b3JrIHdpdGguClNpbXBsZXN0ID0gc2lnbmlmaWNhbnQgbnVtYmVyIG9mIGJhc2VsaW5lIHNhbXBsZXMgd2l0aG91dCBpc3N1ZSBvZiByZXBlYXRlZCBtZWFzdXJlcywgYmFsYW5jZWQgZGlzdHJpYnV0aW9uIG9mIGNsYXNzZXMsIGxpa2VseSBkaWZmZXJlbmNlIHRvIGJlIGRldGVjdGVkIHByZXNlbnQuCgoKYGBge3J9CnBfcHMgPSByZWFkUkRTKCIvVXNlcnMvYWEzNzAvTGlicmFyeS9DbG91ZFN0b3JhZ2UvQm94LUJveC9wcm9qZWN0X2RhdmlkbGFiL0xBRF9MQUJfUGVyc29ubmVsL0FtbWFyYV9BL1Byb2plY3RzL1BPTU1TL1IyNF9QT01NUy9kYXRhL3Byb2Nlc3NlZC8yMDIyMTEwNF9wb21tc19tZXRhZGF0YV9wbGFudC5yZHMiKQoKcF9wcwoKc2FtcGxlX2RhdGEocF9wcykKYGBgCgojIFByZXByb2Nlc3MKCiMjIFJlbW92ZSBwcm9ibGVtYXRpYyBzYW1wbGVzCgpUYWtpbmcgb3V0IHNhbXBsZXMgd2l0aCBsZXNzIHRoYW4gb3IgZXF1YWwgdG8gMCByZWFkcyBhbmQga2VlcGluZyBvbmx5IGVudHJ5IHRpbWVwb2ludCByZWR1Y2VzIHNhbXBsZXMgZnJvbSAzODQgZG93biB0byAyNDAuCgpgYGB7cn0KcF9wcyA9IHBfcHMgJT4lCiAgc3Vic2V0X3NhbXBsZXMoIHJlYWRzID4gMCkgJT4lCiAgc3Vic2V0X3NhbXBsZXModGltZXBvaW50ID09ICJFbnRyeSIpCgpwX3BzCmBgYAoKQ2xhc3MgZGlzdHJpYnV0aW9uIGluIHRoaXMgZGF0YXNldDoKQ2FzZSA9IDE5MwpDb250cm9sID0gNDcKCmBgYHtyfQpzYW1wbGVfZGF0YShwX3BzKSAlPiUKICBhcy5kYXRhLmZyYW1lKCkgJT4lCiAgYXNfdGliYmxlKCkgJT4lCiAgZ3JvdXBfYnkoZ3JvdXApICU+JQogIHN1bW1hcmlzZShjb3VudCA9IG4oKSkKYGBgCgojIyBEYXRhIHRyYW5zZm9ybQoKYGBge3J9Cm90dV9jbHIgPSBhYnVuZGFuY2VzKHBfcHMsICJjbHIiKSAlPiUgIyByY2xyIHRyYW5zZm9ybSBjb3VsZCBub3QgYmUgcGVyZm9ybWVkLCBnZXQgTmFucwogIHQoKQoKcF9wcyA9IHBoeWxvc2VxKG90dV90YWJsZShvdHVfY2xyLCB0YXhhX2FyZV9yb3dzID0gRkFMU0UpLAogICAgICAgICAgICAgICAgICAgc2FtcGxlX2RhdGEoc2FtcGxlX2RhdGEocF9wcykpLAogICAgICAgICAgICAgICAgICAgdGF4X3RhYmxlKHRheF90YWJsZShwX3BzKSkpCgpybShvdHVfY2xyKQpgYGAKCiMjIEZlYXR1cmUgdGFibGUKCiMjIyBFeHRyYWN0IGRhdGEKCk9yZGVyIG9mIGFzdiBjb2xuYW1lcyBpbiBPVFUgdGFibGUgYW5kIHRheCB0YWJsZSBpcyB0aGUgc2FtZS4KCmBgYHtyfQpmZWF0dXJlcyA9IHBfcHNAb3R1X3RhYmxlICU+JQogIGFzLmRhdGEuZnJhbWUoKSAlPiUKICBhc190aWJibGUocm93bmFtZXMgPSBOQSkgJT4lCiAgcm93bmFtZXNfdG9fY29sdW1uKCJzYW1wbGUiKQoKZmVhdHVyZV9sYWJlbHMgPSBjKCJzYW1wbGUiKQoKIG5hbWVzID0gdGF4X3RhYmxlKHBfcHMpICU+JQogICAgICBkYXRhLmZyYW1lKCkgJT4lCiAgICAgIGxvd2VzdF9sZXZlbCgpICU+JQogICBwdWxsKG5hbWUpCgogZmVhdHVyZV9sYWJlbHMgPSBhcHBlbmQoZmVhdHVyZV9sYWJlbHMsIG5hbWVzKQoKIGZlYXR1cmVfbGFiZWxzID0gbWFrZS51bmlxdWUoZmVhdHVyZV9sYWJlbHMpICU+JQogICBtYWtlLm5hbWVzKCkKCiBjb2xuYW1lcyhmZWF0dXJlcykgPSBmZWF0dXJlX2xhYmVscwoKZmVhdHVyZXMKYGBgCiMjIyBBdHRhY2ggZGF0YSBsYWJlbHMKCmBgYHtyfQpzYW1wbGVfbGFiZWxzID0gcF9wc0BzYW1fZGF0YSU+JQogIGFzLmRhdGEuZnJhbWUoKSAlPiUKICBhc190aWJibGUocm93bmFtZXMgPSBOQSkgJT4lCiAgcm93bmFtZXNfdG9fY29sdW1uKCJzYW1wbGUiKSAlPiUKICBzZWxlY3Qoc2FtcGxlLCBncm91cCkKCmZlYXR1cmVzID0gc2FtcGxlX2xhYmVscyAlPiUKICBsZWZ0X2pvaW4oZmVhdHVyZXMpCmBgYAoKIyMjIFplcm8gdmFyaWFuY2UgZmVhdHVyZXMKCk5vIGZlYXR1cmVzIGhhdmUgemVybyB2YXJpYW5jZQpgYGB7cn0Kbnp2ID0gZmVhdHVyZXMgJT4lCiAgc2VsZWN0KCFjKHNhbXBsZSxncm91cCkpICU+JQogIG5lYXJaZXJvVmFyKHNhdmVNZXRyaWNzID0gVFJVRSkKYGBgCgpgYGB7cn0Kcm0obnp2KQpgYGAKCiMjIyBDb3JyZWxhdGVkIHByZWRpY3RvcnMKCmBgYHtyfQpmZWF0dXJlX2NvciA9IGZlYXR1cmVzICU+JQogIHNlbGVjdCghYyhzYW1wbGUsZ3JvdXApKSAlPiUKIGNvcigpIAoKc3VtbWFyeShmZWF0dXJlX2Nvclt1cHBlci50cmkoZmVhdHVyZV9jb3IpXSkKYGBgCgpgYGB7cn0KaGlnaF9jb3IgPSBmaW5kQ29ycmVsYXRpb24oZmVhdHVyZV9jb3IsIGN1dG9mZiA9IC43NSkKCmZpbHRlcmVkX2ZlYXR1cmVzID0gZmVhdHVyZXNbLC1oaWdoX2Nvcl0KYGBgCgpgYGB7cn0Kcm0oaGlnaF9jb3IpCnJtKGZlYXR1cmVfY29yKQpgYGAKCiMjIyBMaW5lYXIgZGVwZW5kZW5jaWVzCgpgYGB7cn0KZmVhdHVyZV9sZCA9IGZpbHRlcmVkX2ZlYXR1cmVzICU+JQogIHNlbGVjdCghYyhzYW1wbGUsZ3JvdXApKSAlPiUKICBmaW5kTGluZWFyQ29tYm9zKCkKCmZpbHRlcmVkX2ZlYXR1cmVzID0gZmlsdGVyZWRfZmVhdHVyZXNbLCAtIGZlYXR1cmVfbGQkcmVtb3ZlXQpgYGAKCiMjIElucHV0IHRhYmxlCgpQcmVwcm9jZXNzaW5nIHJlZHVjZXMgY29sdW1uIG51bWJlciB0byAxMzEgZnJvbSAyMzUKCmBgYHtyfQppbnB1dCA9IGZpbHRlcmVkX2ZlYXR1cmVzICU+JQogIG11dGF0ZShncm91cF9zaHVmZmxlID0gc2FtcGxlKGdyb3VwKSAlPiUKICAgICAgICAgICBhcy5mYWN0b3IoKSwKICAgICAgICAgZ3JvdXAgPSBhcy5mYWN0b3IoZ3JvdXApKQpgYGAKCiMgTW9kZWxpbmcKCkxvYWQgZnVuY3Rpb25zIGZyb20gY2xhc3NpZmljYXRpb25fZnVuY3Rpb25fQUEgZmlsZQoKYGBge3J9CmxhYmVsX2xpc3QgPSBjKCJncm91cCIsICJncm91cF9zaHVmZmxlIikKcmVzdWx0cyA9IGxvb3BfbGFiZWwoaXRlcmF0aW9ucyA9MjAsIGlucHV0LCBsYWJlbF9saXN0LCBwZXJjZW50ID0gMC43LCBkb3duc2FtcGxpbmcgPSBUUlVFKQoKYXVjX2RmID0gcmVzdWx0c1tbMl1dCgppbXBvcnRhbmNlX2RmID0gcmVzdWx0c1tbMV1dCmBgYAoKYGBge3J9CnJtKGxhYmVsX2xpc3QpCmBgYAoKIyBQbG90dGluZwoKIyMgQVVDIHBsb3QgYW5kIHQtdGVzdApgYGB7cn0KYXVjX2RmICU+JQogIHBpdm90X2xvbmdlcihjb2xzID0gYygiZ3JvdXAiLCAiZ3JvdXBfc2h1ZmZsZSIpLCBuYW1lc190byA9ICJmZWF0dXJlIiwgdmFsdWVzX3RvID0gIkFVQyIpICU+JQogIGdncGxvdCgpKwogIGdlb21fYm94cGxvdChhZXMoeD1mZWF0dXJlLCB5ID0gQVVDKSkrCiAgdGhlbWVfY2xhc3NpYygpCgp0LnRlc3QoYXVjX2RmJGdyb3VwLCBhdWNfZGYkZ3JvdXBfc2h1ZmZsZSkKCmBgYAoKIyMgSW1wb3J0YW5jZSBwbG90cwoKIyMjIGxvb2stdXAgdGFibGUKYGBge3J9CiNNYWtpbmcgYXN2IHJlZmVyZW5jZSBmb3IgbmFtZXMgaW4gbW9kZWwKbG9va3VwX2FzdiA9IHRheF90YWJsZShwX3BzKSAlPiUgCiAgICAgZGF0YS5mcmFtZSgpICU+JQogICAgICBsb3dlc3RfbGV2ZWwoKSAlPiUKICAgcm93bmFtZXNfdG9fY29sdW1uKCJhc3YiKSAlPiUgCiAgIHNlbGVjdChhc3YsIG5hbWUpICU+JQogICBtdXRhdGUobmFtZSA9IG1ha2UubmFtZXMobWFrZS51bmlxdWUobmFtZSkpKQpgYGAKCiMjIyBzdW1tYXJpemUgaW1wb3J0YW5jZXMKYGBge3J9CiMgTWFraW5nIHN1bW1hcnkgZGYgb2YgaW1wb3J0YW5jZXMKaW1wb3J0YW5jZV9zdW1tYXJ5ID0gaW1wb3J0YW5jZV9kZiAlPiUKICBwaXZvdF9sb25nZXIoY29scyA9IGMoIk5BLiI6IkFwaXVtLmdyYXZlb2xlbnMiKSwgbmFtZXNfdG8gPSAiZm9vZCIsIHZhbHVlc190byA9ICJpbXBvcnRhbmNlIikgJT4lCiAgZ3JvdXBfYnkodmFyaWFibGUsIGZvb2QpICU+JQogIHN1bW1hcmlzZShtZWFuX2ltcG9ydGFuY2UgPSBtZWFuKGltcG9ydGFuY2UpKSAlPiUKICBhcnJhbmdlKHZhcmlhYmxlLCBkZXNjKG1lYW5faW1wb3J0YW5jZSkpCmBgYAojIyMgdG9wIDEwIHJhbmtlZCBsaXN0CmBgYHtyfQojIEdldHRpbmcgdGhlIHJhbmtlZCBsaXN0CnJhbmtpbmcgPSBpbXBvcnRhbmNlX3N1bW1hcnkgJT4lCiAgZmlsdGVyKHZhcmlhYmxlID09ICJncm91cCIpICU+JQogIHB1bGwoZm9vZCkKCnRvcDEwID0gcmFua2luZ1sxOjEwXQpgYGAKCiMjIyBJbXBvcnRhbmNlIHBsb3QKYGBge3J9CmltcG9ydGFuY2VfZGYgJT4lCiAgcGl2b3RfbG9uZ2VyKGNvbHMgPSBjKCJOQS4iOiJBcGl1bS5ncmF2ZW9sZW5zIiksIG5hbWVzX3RvID0gImZvb2QiLCB2YWx1ZXNfdG8gPSAiaW1wb3J0YW5jZSIpICAlPiUKICBmaWx0ZXIodmFyaWFibGUgPT0gImdyb3VwIiAmIGZvb2QgJWluJSB0b3AxMCkgJT4lCiAgZ2dwbG90KCkrCiAgZ2VvbV9ib3hwbG90KGFlcyh4PSByZW9yZGVyKGZvb2QsLWltcG9ydGFuY2UpLCB5ID0gaW1wb3J0YW5jZSkpKwogIGxhYnMoeD0gIkZvb2QiLCB5ID0iSW1wb3J0YW5jZSIpICsKICB0aGVtZV9jbGFzc2ljKCkgKwogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gNDUsIGhqdXN0PTEpKQpgYGAKClNvbWUgb2YgdGhlIGltcG9ydGFudCBOQXMgZm9yIHByZWRpY3Rpb25zIG9ubHkgbWFwIHRvIGJhY3RlcmlhbCBzcGVjaWVzIGluIEJMQVNULgoKIyBJbXBvcnRhbnQgdGF4YSBkaXJlY3Rpb25zCgpgYGB7cn0Kc2FtcGxlX2RhdGEocF9wcykgJT4lCiAgZGF0YS5mcmFtZSgpICU+JQogIGZpbHRlcih0aW1lcG9pbnQgPT0gIkVudHJ5IikgJT4lCiAgZ2dwbG90KCkgKwogIGdlb21fYm94cGxvdChhZXMoeD0gZ3JvdXAsIHkgPSBibWkpKSsKICB0aGVtZV9jbGFzc2ljKCkKCnBsb3RfZm9vZF9kaXJlY3Rpb24odG9wMTAsIGlucHV0LCB2YXJpYWJsZSA9ICJncm91cCIpCmBgYAoKCmBgYHtyfQpsb29rdXBfYXN2ICU+JQogIGZpbHRlcihuYW1lID09ICJOQS4iKSAlPiUKICBwdWxsKGFzdikKYGBgCgoKYGBge3J9CmZlYXR1cmVfY29yICU+JQogIGFzLmRhdGEuZnJhbWUoKSAlPiUKICBzZWxlY3QoSGVsaWFudGh1cy5hbm51dXMpICU+JQogIGFycmFuZ2UoZGVzYyhIZWxpYW50aHVzLmFubnV1cykpCmBgYAoKCg==